import os
from langchain_unstructured import UnstructuredLoader

# Install directory of the local Tesseract distribution (Windows path).
# Kept in one place so the entries below cannot drift apart.
TESSERACT_DIR = r"D:\01_software\27_pdf_unstructured\tesseract"

# Alternate Hugging Face endpoint (presumably read by the model-download
# code used by the "hi_res" strategy -- confirm against your install).
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
# Location of the Tesseract language data files.
os.environ["TESSDATA_PREFIX"] = rf"{TESSERACT_DIR}\tessdata"
# Append the Tesseract directory to PATH. os.pathsep is used instead of a
# hard-coded ";", a missing PATH no longer raises KeyError, and the entry
# no longer ends in two stray backslashes (a raw string cannot end in a
# single one, which is why the original had r"...\\").
os.environ["PATH"] = os.pathsep.join(
    entry for entry in (os.environ.get("PATH", ""), TESSERACT_DIR) if entry
)
# NOTE(review): value kept exactly as in the original. It looks like a
# directory path, whereas OCR_AGENT presumably selects an OCR agent
# implementation in unstructured -- confirm this setting is intended.
os.environ["OCR_AGENT"] = r"D:\01_software\27_pdf_unstructured\tesseract\\"
# PDF document to be parsed by the loader.
file_path = r'D:\00_study\07_python\PythonProject\src\document_loaders\examples\layout-parser-paper.pdf'

# Build the loader for the PDF with the "hi_res" strategy.
# OCR language support: English plus Simplified Chinese. The list form
# `languages=[...]` replaces the older "+"-joined `ocr_languages` string,
# which unstructured marks as deprecated (see the partitioning docs).
loader_local = UnstructuredLoader(
    file_path=file_path,
    strategy="hi_res",
    languages=["eng", "chi_sim"],
)
# Drain the loader's lazy iterator into a list so the documents can be
# counted here and filtered later; list() replaces the manual append loop.
docs_local = list(loader_local.lazy_load())
# Report how many documents the loader produced.
print(len(docs_local))
# Keep only the documents whose metadata marks them as page 1; documents
# without a "page_number" key are skipped because .get() returns None.
first_page_docs = [doc for doc in docs_local if doc.metadata.get("page_number") == 1]
# Print each first-page document behind a 1-based banner. enumerate()
# replaces the hand-maintained counter (and its stray trailing semicolon).
for doc_count, doc in enumerate(first_page_docs, start=1):
    print(f"第{doc_count}个doc开始！")
    print(doc.page_content)